import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
songs_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
songs = pd.read_csv( songs_url )
songs.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32833 entries, 0 to 32832 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 track_id 32833 non-null object 1 track_name 32828 non-null object 2 track_artist 32828 non-null object 3 track_popularity 32833 non-null int64 4 track_album_id 32833 non-null object 5 track_album_name 32828 non-null object 6 track_album_release_date 32833 non-null object 7 playlist_name 32833 non-null object 8 playlist_id 32833 non-null object 9 playlist_genre 32833 non-null object 10 playlist_subgenre 32833 non-null object 11 danceability 32833 non-null float64 12 energy 32833 non-null float64 13 key 32833 non-null int64 14 loudness 32833 non-null float64 15 mode 32833 non-null int64 16 speechiness 32833 non-null float64 17 acousticness 32833 non-null float64 18 instrumentalness 32833 non-null float64 19 liveness 32833 non-null float64 20 valence 32833 non-null float64 21 tempo 32833 non-null float64 22 duration_ms 32833 non-null int64 dtypes: float64(9), int64(4), object(10) memory usage: 5.8+ MB
Identify the tracks that appear ONCE and ONLY ONCE as discussed in Week 01.
songs.track_id.nunique()
28356
songs.shape[0]
32833
songs.track_id.value_counts()
track_id
7BKLCZ1jbUBVqRi2FVlTVw 10
14sOS5L36385FJ3OL8hew4 9
3eekarcy7kvN4yt5ZFzltW 9
2Fxmhks0bxGSBdJ92vM42m 8
2tnVG71enUj33Ic2nFN6kZ 8
..
3AKwyujeGxTQSZNbx9Ka3c 1
3ITvHA9zhZZdBJsOsAUegF 1
5gsW8TMmNVnevjq13h0Nlp 1
4pN4icvWINm9uN3bElfbec 1
29zWqhca3zt5NsckZqDf6c 1
Name: count, Length: 28356, dtype: int64
songs.track_id.value_counts().value_counts()
count 1 25190 2 2384 3 510 4 142 5 60 6 35 7 17 8 15 9 2 10 1 Name: count, dtype: int64
# Per-track summary: one row per track_id with the number of rows it occupies
# in `songs` and the number of distinct artists, albums and playlists seen
# across those rows.
track_info = (
    songs
    .groupby(['track_id'])
    .agg(
        num_rows=('track_id', 'size'),
        num_artists=('track_artist', 'nunique'),
        num_albums=('track_album_id', 'nunique'),
        num_playlists=('playlist_id', 'nunique'),
    )
    .reset_index()
)
track_info.sort_values(by=['num_rows'], ascending=False)
| track_id | num_rows | num_artists | num_albums | num_playlists | |
|---|---|---|---|---|---|
| 26040 | 7BKLCZ1jbUBVqRi2FVlTVw | 10 | 1 | 1 | 8 |
| 3867 | 14sOS5L36385FJ3OL8hew4 | 9 | 1 | 1 | 7 |
| 13313 | 3eekarcy7kvN4yt5ZFzltW | 9 | 1 | 1 | 7 |
| 23695 | 6WrI0LAC5M1Rw2MnX2ZvEg | 8 | 1 | 1 | 6 |
| 3023 | 0qaWEvPkts34WF68r8Dzx9 | 8 | 1 | 1 | 5 |
| ... | ... | ... | ... | ... | ... |
| 10004 | 2kR3B09M6KeJnchOkxwszt | 1 | 1 | 1 | 1 |
| 10003 | 2kR09CPFP1E8fpg6WEVwJt | 1 | 1 | 1 | 1 |
| 10002 | 2kQ1WxNv3KKxPYFBgcRSnB | 1 | 1 | 1 | 1 |
| 10001 | 2kOupHwtD01Wle9xOYfD3V | 1 | 1 | 1 | 1 |
| 28355 | 7zzZmpw8L66ZPjH1M6qmOs | 1 | 1 | 1 | 1 |
28356 rows × 5 columns
# Keep only the summary rows for tracks that occupy exactly one row of `songs`.
appears_once = track_info['num_rows'] == 1
tracks_info_keep = track_info[appears_once].copy()
tracks_info_keep.num_rows.value_counts()
num_rows 1 25190 Name: count, dtype: int64
# Restrict `songs` to the tracks retained in `tracks_info_keep`; the result is
# an independent copy.
is_kept_track = songs['track_id'].isin(tracks_info_keep['track_id'])
df = songs[is_kept_track].copy()
df.shape
(25190, 23)
df.nunique()
track_id 25190 track_name 21419 track_artist 10303 track_popularity 91 track_album_id 20366 track_album_name 18078 track_album_release_date 4362 playlist_name 440 playlist_id 462 playlist_genre 6 playlist_subgenre 24 danceability 820 energy 948 key 12 loudness 9879 mode 2 speechiness 1267 acousticness 3682 instrumentalness 4609 liveness 1614 valence 1339 tempo 16334 duration_ms 18257 dtype: int64
The track_popularity column is the OUTPUT. The other numeric columns are the INPUTS. The goal of this project was to predict the OUTPUT given the INPUTS! The inputs represent features or characteristics associated with the song! Thus, the goal was to predict the popularity of the song based on its features!
# Columns used as the numeric inputs for the preprocessing steps that follow.
numeric_input_names = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]

# Wide-format frame holding only the inputs, copied so it is independent of `df`.
wf_x = df[numeric_input_names].copy()
wf_x.info()
<class 'pandas.core.frame.DataFrame'> Index: 25190 entries, 3 to 32832 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 danceability 25190 non-null float64 1 energy 25190 non-null float64 2 loudness 25190 non-null float64 3 speechiness 25190 non-null float64 4 acousticness 25190 non-null float64 5 instrumentalness 25190 non-null float64 6 liveness 25190 non-null float64 7 valence 25190 non-null float64 8 tempo 25190 non-null float64 9 duration_ms 25190 non-null int64 dtypes: float64(9), int64(1) memory usage: 2.1 MB
sns.catplot(data = wf_x, kind='box', aspect=2)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Reshape the wide inputs to long format: the old index becomes a `rowid`
# column, and every input contributes (rowid, variable, value) rows.
wf_x_with_rowid = wf_x.reset_index().rename(columns={'index': 'rowid'})
lf_x = pd.melt(
    wf_x_with_rowid,
    id_vars=['rowid'],
    value_vars=numeric_input_names,
    ignore_index=True,
)
lf_x.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 251900 entries, 0 to 251899 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rowid 251900 non-null int64 1 variable 251900 non-null object 2 value 251900 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 5.8+ MB
# One histogram panel per input variable, five panels per row. Axes are not
# shared and common bins are switched off, so panels are not forced onto a
# single scale.
raw_hist_facet_opts = {'sharex': False, 'sharey': False}
sns.displot(
    data=lf_x,
    x='value',
    col='variable',
    col_wrap=5,
    kind='hist',
    common_bins=False,
    facet_kws=raw_hist_facet_opts,
)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Most of the inputs are NOT symmetric!
sns.pairplot(data = wf_x)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Annotated heatmap of the pairwise correlations between the numeric inputs,
# with the colour scale fixed to [-1, 1] and centred on zero.
fig, ax = plt.subplots()
input_corr = wf_x.corr(numeric_only=True)
sns.heatmap(
    data=input_corr,
    ax=ax,
    annot=True,
    fmt='1.2f',
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
)
plt.show()
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
# Workflow: standardise the inputs, then apply PCA with its default settings.
pca_wflow = Pipeline(
    steps=[
        ('std_scale', StandardScaler()),
        ('pca', PCA()),
    ]
)

# Fit the pipeline defined just above. The previous code fitted `basic_wflow`,
# a name that is never defined in this file, so a fresh top-to-bottom run
# raised NameError.
pca_wflow_fit = pca_wflow.fit(wf_x)
pca_wflow_fit
Pipeline(steps=[('std_scale', StandardScaler()), ('pca', PCA())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('std_scale', StandardScaler()), ('pca', PCA())])StandardScaler()
PCA()
def my_screeplot(pca_object, figsize_use=None):
    """Plot the explained-variance ratio of each principal component.

    Parameters
    ----------
    pca_object
        Object exposing an ``explained_variance_ratio_`` sequence.
    figsize_use
        Forwarded to ``plt.subplots`` as ``figsize``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    figure, axes = plt.subplots(figsize=figsize_use)

    variance_ratio = pca_object.explained_variance_ratio_
    # Components are numbered from 1 on the x-axis.
    component_number = np.arange(1, len(variance_ratio) + 1)

    axes.plot(component_number, variance_ratio, 'bo-')
    axes.set_xlabel('PC')
    axes.set_ylabel('Variance explained ratio')
    plt.show()
def my_total_variance_explained_plot(pca_object, figsize_use=None):
    """Plot the cumulative explained-variance ratio against component number.

    Dashed horizontal reference lines are drawn at 0.5 (grey), 0.8 (black)
    and 0.95 (red).

    Parameters
    ----------
    pca_object
        Object exposing an ``explained_variance_ratio_`` array.
    figsize_use
        Forwarded to ``plt.subplots`` as ``figsize``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    figure, axes = plt.subplots(figsize=figsize_use)

    # Components are numbered from 1 on the x-axis.
    component_number = np.arange(1, len(pca_object.explained_variance_ratio_) + 1)
    running_total = pca_object.explained_variance_ratio_.cumsum()
    axes.plot(component_number, running_total, 'bo-')

    # (height, colour) of each reference line, drawn in this order.
    reference_levels = ((0.5, 'grey'), (0.8, 'black'), (0.95, 'red'))
    for level, colour in reference_levels:
        axes.axhline(y=level, color=colour, linestyle='--')

    axes.set_xlabel('PC')
    axes.set_ylabel('Total variance explained')
    plt.show()
def my_eigenvalue_plot(pca_object, figsize_use=None):
    """Plot ``explained_variance_`` (labelled 'Eigenvalue') per component.

    A dashed grey horizontal reference line is drawn at 1.

    Parameters
    ----------
    pca_object
        Object exposing an ``explained_variance_`` sequence.
    figsize_use
        Forwarded to ``plt.subplots`` as ``figsize``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    figure, axes = plt.subplots(figsize=figsize_use)

    eigenvalues = pca_object.explained_variance_
    # Components are numbered from 1 on the x-axis.
    component_number = np.arange(1, len(eigenvalues) + 1)

    axes.plot(component_number, eigenvalues, 'bo-')
    # NOTE(review): presumably the "eigenvalue greater than one" rule of
    # thumb for standardised inputs - confirm intent.
    axes.axhline(y=1, color='grey', linestyle='--')
    axes.set_xlabel('PC')
    axes.set_ylabel('Eigenvalue')
    plt.show()
# Look up the fitted PCA step once and feed it to the three diagnostic plots.
fitted_pca_step = pca_wflow_fit.named_steps['pca']
my_screeplot(fitted_pca_step)
my_total_variance_explained_plot(fitted_pca_step)
my_eigenvalue_plot(fitted_pca_step)
This suggests PCA does NOT help us here!
Makes sense since the correlation plot did NOT reveal a lot of correlation in the data!
Since PCA does NOT seem to help, we will NOT perform PCA!
# Workflow: standardise, apply a Yeo-Johnson power transform, then
# standardise the transformed values again.
yj_steps = [
    ('std_scale', StandardScaler()),
    ('yj', PowerTransformer(method='yeo-johnson', standardize=True)),
    ('std_yj', StandardScaler()),
]
yj_wflow = Pipeline(steps=yj_steps)
yj_wflow_fit = yj_wflow.fit(wf_x)
yj_wflow_fit
Pipeline(steps=[('std_scale', StandardScaler()), ('yj', PowerTransformer()),
('std_yj', StandardScaler())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('std_scale', StandardScaler()), ('yj', PowerTransformer()),
('std_yj', StandardScaler())])StandardScaler()
PowerTransformer()
StandardScaler()
X_yj = yj_wflow_fit.transform( wf_x )
X_yj.shape
(25190, 10)
wf_yj = pd.DataFrame( X_yj, columns=wf_x.columns.to_list())
sns.catplot(data = wf_yj, kind='box', aspect=2)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
sns.pairplot(data = wf_yj)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Long-format version of the Yeo-Johnson transformed inputs: the index becomes
# `rowid`, and every transformed column contributes (rowid, variable, value) rows.
yj_value_columns = wf_yj.columns.to_list()
wf_yj_with_rowid = wf_yj.reset_index().rename(columns={'index': 'rowid'})
lf_yj = pd.melt(
    wf_yj_with_rowid,
    id_vars=['rowid'],
    value_vars=yj_value_columns,
    ignore_index=True,
)
lf_yj.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 251900 entries, 0 to 251899 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rowid 251900 non-null int64 1 variable 251900 non-null object 2 value 251900 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 5.8+ MB
sns.displot(data = lf_yj, x='value', col='variable', kind='hist',
col_wrap=5,
common_bins=False,
facet_kws={'sharex': False, 'sharey': False})
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Again, do NOT apply PCA since the correlation is mostly very low.
# Workflow: standardise, map each input to a normal output distribution with
# a quantile transform, then standardise the transformed values again.
qt_wflow = Pipeline(
    steps=[
        ('std_scale', StandardScaler()),
        # random_state is pinned so that any randomness inside the transformer
        # (e.g. subsampling of rows when estimating quantiles, which depends on
        # the installed scikit-learn's `subsample` default) is repeatable from
        # run to run.
        ('qt', QuantileTransformer(output_distribution='normal', random_state=0)),
        ('std_qt', StandardScaler()),
    ]
)
qt_wflow_fit = qt_wflow.fit(wf_x)

# Transform the same inputs and wrap the resulting array in a DataFrame that
# reuses the input column names.
X_qt = qt_wflow_fit.transform(wf_x)
wf_qt = pd.DataFrame(X_qt, columns=wf_x.columns.to_list())
sns.catplot(data = wf_qt, kind='box', aspect=2)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
sns.pairplot(data = wf_qt)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
What's going on with instrumentalness?
sns.displot(data = wf_qt, x='instrumentalness', kind='hist')
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Looks like values are stacking up!!
wf_qt.loc[ wf_qt.instrumentalness < -1, : ].shape[0] / wf_qt.shape[0]
0.3529972211194919
Go back to the original raw variable!
sns.displot(data = wf_x, x='instrumentalness', kind='hist', bins=11)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
This variable has a very strange distribution! Nearly 84% of the observations are less than 0.1!
wf_x.loc[ wf_x.instrumentalness < 0.1, :].shape[0] / wf_x.shape[0]
0.8392219134577213
The PREPROCESSING transformations made the strange character of this variable easier to identify!!!!
Maybe we need to rethink how to handle this variable...perhaps it could be treated as a CATEGORICAL variable!
Examine the marginal distributions of the rest of the transformed variables.
# Long-format version of the quantile-transformed inputs: the index becomes
# `rowid`, and every transformed column contributes (rowid, variable, value) rows.
qt_value_columns = wf_qt.columns.to_list()
wf_qt_with_rowid = wf_qt.reset_index().rename(columns={'index': 'rowid'})
lf_qt = pd.melt(
    wf_qt_with_rowid,
    id_vars=['rowid'],
    value_vars=qt_value_columns,
    ignore_index=True,
)

# One histogram panel per transformed variable, five panels per row, with
# unshared axes and common bins switched off.
qt_hist_facet_opts = {'sharex': False, 'sharey': False}
sns.displot(
    data=lf_qt,
    x='value',
    col='variable',
    col_wrap=5,
    kind='hist',
    common_bins=False,
    facet_kws=qt_hist_facet_opts,
)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)